Note: Using vst normalisation (just for performing the DEA from the limma package) and filtering out genes with a raw expression sum below 40
# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
print('Columns in datExpr don\'t match the rowd in datMeta!')
}
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
#################################################################################
# FILTERS:
# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
# 2. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
# 3. Filter samples with rowSums <= 40
to_keep = rowSums(datExpr)>40
datExpr = datExpr[to_keep,]
datProbes = datProbes[to_keep,]
#################################################################################
# Annotate SFARI genes
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
#################################################################################
# Add functional annotation to genes from GO
GO_annotations = read.csv('./../working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
datExpr_backup = datExpr
rm(getinfo, to_keep, gene_names, mart, GO_annotations)
Number of genes:
nrow(datExpr)
## [1] 33478
################################################################################
# Calculate Differential Expression using limma and DESeq2 packages
# Limma
if(!file.exists('./../working_data/genes_ASD_DE_info_limma.csv')){
# First perform vst normalisation
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_)
dds = estimateSizeFactors(dds)
vst_output = vst(dds)
datExpr_vst = assay(vst_output)
# Calculate DE of normalised data
mod = model.matrix(~ Diagnosis_, data=datMeta)
corfit = duplicateCorrelation(datExpr_vst, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr_vst, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr_vst))
DE_info_limma = top_genes[match(rownames(datExpr_vst), rownames(top_genes)),] %>%
rownames_to_column(var = 'ID') %>%
mutate('logFC_limma'=logFC, 'adj.P.Val_limma'=adj.P.Val) %>%
dplyr::select(ID, logFC_limma, adj.P.Val_limma)
write.csv(DE_info_limma, './../working_data/genes_ASD_DE_info_limma.csv', row.names = FALSE)
rm(mod, corfit, lmfit, fit, top_genes)
} else DE_info_limma = read.csv('./../working_data/genes_ASD_DE_info_limma.csv')
# DESeq2 DE
if(!file.exists('./../working_data/genes_ASD_DE_info_DESeq2.csv')){
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
ddsSE = DESeqDataSet(se, design =~Diagnosis_)
dds = DESeq(ddsSE)
DE_info_DESeq2 = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
mutate('logFC_DESeq2'=log2FoldChange, 'adj.P.Val_DESeq2'=padj) %>%
dplyr::select(ID, logFC_DESeq2, adj.P.Val_DESeq2)
write.csv(DE_info_DESeq2, './../working_data/genes_ASD_DE_info_DESeq2.csv', row.names = FALSE)
rm(counts, rowRanges, se, ddsSE, dds)
} else DE_info_DESeq2 = read.csv('./../working_data/genes_ASD_DE_info_DESeq2.csv')
DE_info = DE_info_limma %>% left_join(DE_info_DESeq2, by='ID')
rm(DE_info_limma, DE_info_DESeq2)
It seems like there are two main lines of points
ggplotly(DE_info %>% ggplot(aes(logFC_DESeq2, logFC_limma)) + geom_point(alpha=0.05, aes(id=ID)) +
geom_vline(xintercept=-log2(1.2), color='#cc0099', size=0.5) +
geom_hline(yintercept=-log2(1.2), color='#cc0099', size=0.5) +
geom_vline(xintercept= log2(1.2), color='#cc0099', size=0.5) +
geom_hline(yintercept= log2(1.2), color='#cc0099', size=0.5) +
geom_smooth(method=lm, se=F, color='#009999', size=0.5) + theme_minimal() + coord_fixed() +
ggtitle(glue('LogFC (corr=',round(cor(DE_info$logFC_limma, DE_info$logFC_DESeq2),2),')')))
Although both functions use the BH method to adjust for multiple testing, the limma package seems to be more strict, assigning a high p-value to many of the points that DESeq2 considers statistically significant
DE_info %>% ggplot(aes(adj.P.Val_DESeq2, adj.P.Val_limma)) + geom_point(alpha=0.05, aes(id=ID)) +
geom_vline(xintercept=0.05, color='#cc0099') + geom_hline(yintercept=0.05, color='#cc0099') +
geom_smooth(method=lm, se=F, color='#009999', size=0.5) + theme_minimal() + coord_fixed() +
ggtitle(glue('Adjusted p-values (corr=',round(cor(DE_info$adj.P.Val_limma,
DE_info$adj.P.Val_DESeq2),2),')'))
signif_genes = data.frame('limma Significance' = DE_info$adj.P.Val_limma<0.05 & DE_info$logFC_limma>log2(1.2),
'DESeq2 Significance' = DE_info$adj.P.Val_DESeq2<0.05 & DE_info$logFC_DESeq2>log2(1.2),
'ID' = DE_info$ID)
signif_genes %>% dplyr::select(-ID) %>% table
## DESeq2.Significance
## limma.Significance FALSE TRUE
## FALSE 31568 481
## TRUE 184 1245
signif_genes %>% dplyr::select(-ID) %>% table/nrow(DE_info)*100
## DESeq2.Significance
## limma.Significance FALSE TRUE
## FALSE 94.2947607 1.4367644
## TRUE 0.5496147 3.7188601
print(glue(round(sum(signif_genes$limma.Significance & !signif_genes$DESeq2.Significance)/sum(signif_genes$limma.Significance)*100,2),
'% of the statistically significant limma genes are not significant for DESeq2.
', round(sum(!signif_genes$limma.Significance & signif_genes$DESeq2.Significance)/sum(signif_genes$DESeq2.Significance)*100,2),
'% of the statistically significant DESeq2 genes are not significant for limma.'))
## 12.88% of the statistically significant limma genes are not significant for DESeq2.
## 27.87% of the statistically significant DESeq2 genes are not significant for limma.
SFARI score distribution of genes classified as significant by both methods
both_methods = data.frame('ID'=signif_genes %>% filter(limma.Significance & DESeq2.Significance) %>% dplyr::select(ID)) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)
table(both_methods$`gene-score`, useNA='ifany')
##
## 1 2 3 4 5 6 <NA>
## 2 3 18 47 27 2 1146
round(table(both_methods$`gene-score`, useNA='ifany')/nrow(both_methods)*100,2)
##
## 1 2 3 4 5 6 <NA>
## 0.16 0.24 1.45 3.78 2.17 0.16 92.05
rm(both_methods)
SFARI score distribution of genes classified as significant by limma but not DESeq2
only_limma = data.frame('ID'=signif_genes %>% filter(limma.Significance & !DESeq2.Significance) %>% dplyr::select(ID)) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)
table(only_limma$`gene-score`, useNA='ifany')
##
## 2 3 4 5 6 <NA>
## 2 3 3 5 1 170
round(table(only_limma$`gene-score`, useNA='ifany')/nrow(only_limma)*100,2)
##
## 2 3 4 5 6 <NA>
## 1.09 1.63 1.63 2.72 0.54 92.39
rm(only_limma)
SFARI score distribution of genes classified as significant by DESeq2 but not limma
only_DESeq2 = data.frame('ID'=signif_genes %>% filter(!limma.Significance & DESeq2.Significance) %>% dplyr::select(ID)) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)
table(only_DESeq2$`gene-score`, useNA='ifany')
##
## 2 3 4 5 <NA>
## 2 2 6 3 468
round(table(only_DESeq2$`gene-score`, useNA='ifany')/nrow(only_DESeq2)*100,2)
##
## 2 3 4 5 <NA>
## 0.42 0.42 1.25 0.62 97.30
rm(only_DESeq2)
SFARI score distribution of genes classified as not significant by both methods
neither = data.frame('ID'=signif_genes %>% filter(!limma.Significance & !DESeq2.Significance) %>% dplyr::select(ID)) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)
table(neither$`gene-score`, useNA='ifany')
##
## 1 2 3 4 5 6 <NA>
## 22 53 165 377 132 21 30799
round(table(neither$`gene-score`, useNA='ifany')/nrow(neither)*100,2)
##
## 1 2 3 4 5 6 <NA>
## 0.07 0.17 0.52 1.19 0.42 0.07 97.56
rm(neither, signif_genes)